Human OOPS data CONTENT

  1. Aggregate into master.protein and QC for MCF10A Keep NC proteins represented in CL Find categories of the NC-only proteins Find intersection, highly confident RBP threshold. Aggregate into master.protein and QC for U20S Keep NC proteins represented in CL Find categories of the NC-only proteins Classify U20S into seen once, twice or three-times Find intersection, highly confident RBP threshold.

1*. Intersection between OOPS U20S data 1**. Intersection between Veronica’s SELEX data Not useful: SELEX is relative, cannot conduct enrichment analysis.

  1. Attempt for RBP proteins
library(dplyr)

Attaching package: ‘dplyr’

The following object is masked from ‘package:UniProt.ws’:

    select

The following objects are masked from ‘package:BiocGenerics’:

    combine, intersect, setdiff, union

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
library(dplyr)
library(tidyr)

Attaching package: ‘tidyr’

The following object is masked from ‘package:RCurl’:

    complete
  1. Aggregate MCF10A peptides into protein identification
peptide_df <- read.table("../raw/MCF10A_sw_only_crap_checked.txt", sep="\t", header=T)
print(head(peptide_df))
print(dim(peptide_df))
[1] 101184     71
print(unique)
function (x, incomparables = FALSE, ...) 
standardGeneric("unique")
<environment: 0x10bf20668>
attr(,"generic")
[1] "unique"
attr(,"generic")attr(,"package")
[1] "BiocGenerics"
attr(,"package")
[1] "BiocGenerics"
attr(,"group")
list()
attr(,"valueClass")
character(0)
attr(,"signature")
[1] "x"
attr(,"default")
Method Definition (Class "derivedDefaultMethod"):

function (x, incomparables = FALSE, ...) 
UseMethod("unique")
<bytecode: 0x10bf1dcb0>
<environment: namespace:base>

Signatures:
        x    
target  "ANY"
defined "ANY"
attr(,"skeleton")
(function (x, incomparables = FALSE, ...) 
UseMethod("unique"))(x, incomparables, ...)
attr(,"class")
[1] "standardGeneric"
attr(,"class")attr(,"package")
[1] "methods"
print(table(peptide_df$filename))

                     filename MCF10A_CL_1_PeptideGroups.txt MCF10A_CL_2_PeptideGroups.txt 
                            9                         13450                         14412 
MCF10A_CL_3_PeptideGroups.txt MCF10A_CL_4_PeptideGroups.txt MCF10A_CL_5_PeptideGroups.txt 
                        15411                         24755                         15683 
MCF10A_NC_1_PeptideGroups.txt MCF10A_NC_2_PeptideGroups.txt MCF10A_NC_3_PeptideGroups.txt 
                         2586                          2330                          3054 
MCF10A_NC_4_PeptideGroups.txt MCF10A_NC_5_PeptideGroups.txt 
                         3942                          5552 
cat("Tally of peptides at each stage:\n")
Tally of peptides at each stage:
cat(sprintf("%s\tAll peptides with a PSM\n", length(rownames(peptide_df))))
101184  All peptides with a PSM
cat(sprintf("These peptides are associated with %s master proteins\n",
            length(unique(peptide_df$master_protein))))
These peptides are associated with 2575 master proteins
#print(table(peptide_df$Quan.Info))
#peptide_df <- peptide_df[peptide_df$Quan.Info=="Unique",]
#cat(sprintf("%s\tExcluding peptides without 'Quan.Info'='Unique'\n", length(rownames(peptide_df))))
peptide_df <- peptide_df[peptide_df$master_protein!="",]
cat(sprintf("%s\tExcluding peptides without a master protein\n", length(rownames(peptide_df))))
89786   Excluding peptides without a master protein
cat(sprintf("These peptides are associated with %s master proteins\n",
            length(unique(peptide_df$master_protein))))
These peptides are associated with 2574 master proteins
peptide_df <- peptide_df[peptide_df$unique==1,]
cat(sprintf("%s\tExcluding peptides without a unique master protein\n", length(rownames(peptide_df))))
84570   Excluding peptides without a unique master protein
cat(sprintf("These peptides are associated with %s master proteins\n",
            length(unique(peptide_df$master_protein))))
These peptides are associated with 2251 master proteins
peptide_df <- peptide_df[peptide_df$crap_protein==0,]
cat(sprintf("%s\tExcluding peptides matching a cRAP protein\n", length(rownames(peptide_df))))
83841   Excluding peptides matching a cRAP protein
cat(sprintf("These peptides are associated with %s master proteins\n",
            length(unique(peptide_df$master_protein))))
These peptides are associated with 2230 master proteins
peptide_df <- peptide_df[peptide_df$associated_crap_protein==0,]
cat(sprintf("%s\tExcluding peptides associated with a cRAP protein\n", length(rownames(peptide_df))))
81098   Excluding peptides associated with a cRAP protein
cat(sprintf("These peptides are associated with %s master proteins\n",
            length(unique(peptide_df$master_protein))))
These peptides are associated with 2199 master proteins
print(head(peptide_df))
protein_df <- peptide_df[,c("master_protein", "filename")]
print(dim(protein_df))
[1] 81098     2
protein_df <- protein_df %>% distinct
print(dim(protein_df))
[1] 10997     2
protein_df <- protein_df %>% separate(col=filename, into=c("_", "treatment", "rep"), sep="_")
Too many values at 10997 locations: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...
print(table(protein_df$rep, protein_df$treatment))
   
      CL   NC
  1 1492  416
  2 1622  395
  3 1609  518
  4 2013  567
  5 1627  738

Intersection CL to NC

head(protein_df)
protein_df_NC <- protein_df[protein_df$treatment=='NC',]
length((unique(protein_df_NC$master_protein)))
[1] 863
protein_df_CL <- protein_df[protein_df$treatment=='CL',]
length(unique(protein_df_CL$master_protein))
[1] 2169
common <- intersect(protein_df_NC$master_protein, protein_df_CL$master_protein) 
# There are 30 masterproteins in NC not in CL 
venn <- draw.pairwise.venn(area1 =length(unique(protein_df_CL$master_protein)), area2 = length(unique(protein_df_NC$master_protein)), cross.area = length(common), category = c("CL", "NC"),    fill = c("blue", "red"))
grid.draw(venn)
grid.newpage()

grid.arrange(gTree(children=venn), top="Master_proteins", bottom="30 non-overlapping in the NC data")
Error: could not find function "grid.arrange"

1* Intersect with the previous U20S data

1** MCF10A versus Veronica analysis

#Upload Veronica's data
peptide <- read.table("/Users/ciromonti/Documents/GitHub/ThreeTs/proteomics/trizol/MCF10A/raw/Veronica_TMT_10_Plex_F_PeptideGroups_sw_only_crap_checked.txt", sep="\t", header=T)
#change directory un RStudio to make data more readily accesible 
print(head(peptide))
print(dim(peptide))
print(colnames(peptide))
#QC of the dataset before removing the Master.Proteins.
#Can use sprintf to print the the number of peptides lost at each step. 
#Remove the cRAP proteins
peptide <- peptide  %>% filter(peptide$crap_protein == 0)
#Remove the cRAP associated proteins
peptide <- peptide %>% filter(peptide$associated_crap_protein == 0)
#Keep proteins associted to a unique master protein
peptide <- peptide %>% filter(peptide$unique == 1)
dim(peptide)

peptide <- peptide[, c("Sequence", "master_protein")]
length(peptide$master_protein)
peptide <- data.frame(unique(peptide$master_protein))
#This is the proteomics background for MCF10A. Caveat these are not baseline conditions: insulin-starved or insulin-enriched. 
OOPs_peptide <- OOPs_peptide[, c("Sequence", "master_protein")]
OOPs_peptide <- OOPs_peptide[, c("Sequence", "master_protein")]
length(OOPs_peptide$master_protein)
[1] 81098
OOPs_peptide <- OOPs_peptide[, c("Sequence", "master_protein")]
length(OOPs_peptide$master_protein)
[1] 81098
OOPs_peptide <- data.frame(unique(OOPs_peptide$master_protein))
#Intersection between the datasets 
overlap <- intersect(peptide$unique.peptide.master_protein., OOPs_peptide$unique.OOPs_peptide.master_protein.)  
length(overlap)
venn <- draw.pairwise.venn(area1 =length(peptide$unique.peptide.master_protein.), area2 = length(OOPs_peptide$unique.OOPs_peptide.master_protein.), cross.area = length(overlap), category = c("total", "OOPS"),    fill = c("blue", "red"))
grid.draw(venn)
grid.newpage()
  1. RBP function analysis following BECKMANN et al 2015 methods: GO terms contain following information: ‘mRNA’ ‘splic’ ‘RNA binding’ ‘RNA’ ‘RNP’ ‘translation’ ‘ribosom’ ‘nuclease’ ‘exosome
#Finding prosite binding domains 
#Trying to find the glycoprotein enrichment with features component of UniProt.ws
#Only 263/2199 have a "glyco" feature, need to proceed with API. 
test <- select(up,
              keys = OOPs_peptide$unique.OOPs_peptide.master_protein., 
              columns = c("FEATURES", "SUBCELLULAR-LOCATIONS"), 
              keystyle = "UNIPROTKB") 
Getting extra data for P15144,O95218,Q13428,P07384,P15924,P67809
Getting extra data for O43251,P98160,Q13155,P62917,P07996,Q9UNX3
Getting extra data for Q9Y3T9,P06748,Q9Y2Z0,P78362,P27348,Q93052
Getting extra data for Q13206,Q9Y6G3,P62861,Q12959,Q9H501,O75306
Getting extra data for P10768,Q9NQW6,P15153,O14686,Q8IXB1,O14639
Getting extra data for Q9NNW7,Q13423,Q9NRY4,Q13535,O75629,O15084
'select()' returned 1:1 mapping between keys and columns

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).

---
title: "R Notebook"
output:
  pdf_document: default
  html_notebook: default
---

Human OOPS data CONTENT

1. Aggregate into master.protein and QC for MCF10A
Keep NC proteins represented in CL 
Find categories of the NC-only proteins
Find intersection, highly confident RBP threshold.
Aggregate into master.protein and QC for U20S
Keep NC proteins represented in CL 
Find categories of the NC-only proteins
Classify U20S into seen once, twice or three-times
Find intersection, highly confident RBP threshold.

1*. Intersection between OOPS U20S data
1**. Intersection between Veronica's SELEX data
Not useful: SELEX is relative, cannot conduct enrichment analysis.

2. Attempt for RBP proteins


```{r}
library(dplyr)
library(tidyr)
library(VennDiagram)
library(grid)
library(data.table)
library(base)
library(mygene)
library(reshape2)
source("https://bioconductor.org/biocLite.R")
biocLite("UniProt.ws")
library(UniProt.ws)
biocLite("biomaRt")
library(biomaRt)
```

1. Aggregate MCF10A peptides into protein identification

```{r}
peptide_df <- read.table("../raw/MCF10A_sw_only_crap_checked.txt", sep="\t", header=T)
print(head(peptide_df))
print(dim(peptide_df))
print(unique)
print(table(peptide_df$filename))
```
```{r}
cat("Tally of peptides at each stage:\n")
cat(sprintf("%s\tAll peptides with a PSM\n", length(rownames(peptide_df))))
cat(sprintf("These peptides are associated with %s master proteins\n",
            length(unique(peptide_df$master_protein))))

#print(table(peptide_df$Quan.Info))
#peptide_df <- peptide_df[peptide_df$Quan.Info=="Unique",]
#cat(sprintf("%s\tExcluding peptides without 'Quan.Info'='Unique'\n", length(rownames(peptide_df))))

peptide_df <- peptide_df[peptide_df$master_protein!="",]
cat(sprintf("%s\tExcluding peptides without a master protein\n", length(rownames(peptide_df))))
cat(sprintf("These peptides are associated with %s master proteins\n",
            length(unique(peptide_df$master_protein))))

peptide_df <- peptide_df[peptide_df$unique==1,]
cat(sprintf("%s\tExcluding peptides without a unique master protein\n", length(rownames(peptide_df))))
cat(sprintf("These peptides are associated with %s master proteins\n",
            length(unique(peptide_df$master_protein))))

peptide_df <- peptide_df[peptide_df$crap_protein==0,]
cat(sprintf("%s\tExcluding peptides matching a cRAP protein\n", length(rownames(peptide_df))))
cat(sprintf("These peptides are associated with %s master proteins\n",
            length(unique(peptide_df$master_protein))))

peptide_df <- peptide_df[peptide_df$associated_crap_protein==0,]
cat(sprintf("%s\tExcluding peptides associated with a cRAP protein\n", length(rownames(peptide_df))))
cat(sprintf("These peptides are associated with %s master proteins\n",
            length(unique(peptide_df$master_protein))))


```

```{r}
print(head(peptide_df))
protein_df <- peptide_df[,c("master_protein", "filename")]
print(dim(protein_df))
protein_df <- protein_df %>% distinct
print(dim(protein_df))
protein_df <- protein_df %>% separate(col=filename, into=c("_", "treatment", "rep"), sep="_")
print(table(protein_df$rep, protein_df$treatment))
```

Intersection CL to NC

```{r}
head(protein_df)
protein_df_NC <- protein_df[protein_df$treatment=='NC',]
length((unique(protein_df_NC$master_protein)))

protein_df_CL <- protein_df[protein_df$treatment=='CL',]
length(unique(protein_df_CL$master_protein))

common <- intersect(protein_df_NC$master_protein, protein_df_CL$master_protein) 

# There are 30 masterproteins in NC not in CL 

venn <- draw.pairwise.venn(area1 =length(unique(protein_df_CL$master_protein)), area2 = length(unique(protein_df_NC$master_protein)), cross.area = length(common), category = c("CL", "NC"), 	fill = c("blue", "red"))
grid.draw(venn)
grid.newpage()
grid.arrange(gTree(children=venn), top="Master_proteins", bottom="30 non-overlapping in the NC data")

# Find the IDs of these 30
rem_data_master <- setdiff(protein_df_NC$master_protein, protein_df_CL$master_protein)

#To continue with analysis of which proteins these are need a proteomics background.

```


1* Intersect with the previous U20S data
```{r}
PREPS <- readRDS("../../integration/notebooks/agg_geiger_max.rds")
print(head(PREPS))

PREPS_all <- PREPS  %>% filter(PREPS$seen == TRUE)
PREPS_one <- PREPS  %>% filter(PREPS$seen_once == TRUE)
PREPS_two <- PREPS  %>% filter(PREPS$seen_twice == TRUE)
PREPS_three <-  PREPS %>% filter(PREPS$seen_three == TRUE)
# Given data structure, should group two and three
PREPS_plus <- PREPS %>% filter(PREPS$seen_twice == TRUE & PREPS$seen_three == TRUE)

protein_df <- data.frame(unique(protein_df$master_protein))

## comparing all U20S to MCF10A
common <- intersect(PREPS_all$master_protein, protein_df$unique.protein_df.master_protein.)  
length(common)
# 1710. Proportion
print(length(common)/nrow(PREPS_all))

## comparing once U20S to MCF10A
common_one <- intersect(PREPS_one$master_protein, protein_df$unique.protein_df.master_protein.)  
length(common_one)
# 204. Proportion
print(length(common_one)/nrow(PREPS_one))

## comparing twice U20S to MCF10A
common_two <- intersect(PREPS_two$master_protein, protein_df$unique.protein_df.master_protein.)  
length(common_two)
# 1506. Proportion
print(length(common_two)/nrow(PREPS_two))

## comparing three U20S to MCF10A
common_three <- intersect(PREPS_three$master_protein, protein_df$unique.protein_df.master_protein.)  
length(common_three)
# 1208. Proportion
print(length(common_three)/nrow(PREPS_three))

t <- read.table(text = "PROP TOTAL
ALL 0.29 1710
ONE 0.28 204
TWO 0.69 1506
THREE 0.76 1208", header = TRUE)
               
plot(t$PROP, type = "l")
#Cannot make direct conclusions. Data collated in such a manner that proteins seen only 1 in ONCE. TWO and THREE are overlapping. 


venn.plot2 <-draw.quad.venn(2199, 723, 2177, 1679, 204, 1506, 1280, 0, 0, 1679, 0, 0, 1280, 0, 0, category = c("MCF10a", "U20S(1)", "U20S(2)", "U20S(3)"), lwd = rep(2, 4), lty = rep("solid", 4), col = rep("black", 4), fill = c("red", "blue", "green", "yellow"), alpha = rep(0.5, 4),label.col = rep("black", 15), cex = rep(1, 15),fontface = rep("plain", 15), fontfamily = rep("serif",15), cat.pos = c(-15, 15, 0, 0), cat.dist = c(0.22, 0.22, 0.11, 0.11), cat.col = rep("black", 4), cat.cex = rep(1, 4), cat.fontface = rep("plain", 4), cat.fontfamily = rep("serif", 4), cat.just = rep(list(c(0.5, 0.5)), 4), rotation.degree = 0, rotation.centre = c(0.5, 0.5), ind = TRUE, cex.prop = NULL, print.mode = "raw", sigdigs = 3, direct.area = FALSE, area.vector = 0)


grid.draw(venn.plot2)

## Want to analyze the "high-confidence" RBP

s <- intersect(protein_df$unique.protein_df.master_protein., PREPS_plus$master_protein)
...

```

1** MCF10A versus Veronica analysis 


```{r}
#Upload Veronica's data
peptide <- read.table("/Users/ciromonti/Documents/GitHub/ThreeTs/proteomics/trizol/MCF10A/raw/Veronica_TMT_10_Plex_F_PeptideGroups_sw_only_crap_checked.txt", sep="\t", header=T)
#change directory un RStudio to make data more readily accesible 
print(head(peptide))
print(dim(peptide))
print(colnames(peptide))
```
```{r}
#QC of the dataset before removing the Master.Proteins.
#Can use sprintf to print the the number of peptides lost at each step. 
#Remove the cRAP proteins
peptide <- peptide  %>% filter(peptide$crap_protein == 0)
#Remove the cRAP associated proteins
peptide <- peptide %>% filter(peptide$associated_crap_protein == 0)
#Keep proteins associted to a unique master protein
peptide <- peptide %>% filter(peptide$unique == 1)
dim(peptide)

peptide <- peptide[, c("Sequence", "master_protein")]
length(peptide$master_protein)
peptide <- data.frame(unique(peptide$master_protein))
#This is the proteomics background for MCF10A. Caveat these are not baseline conditions: insulin-starved or insulin-enriched. 
```

```{r}
OOPs_peptide <- read.table("/Users/ciromonti/Documents/GitHub/ThreeTs/proteomics/trizol/MCF10A/raw/MCF10A_sw_only_crap_checked.txt", sep="\t", header=T)
OOPs_peptide <- OOPs_peptide %>% filter(OOPs_peptide$crap_protein == 0)
OOPs_peptide <- OOPs_peptide %>% filter(OOPs_peptide$associated_crap_protein == 0)
OOPs_peptide <- OOPs_peptide %>% filter(OOPs_peptide$unique == 1)

  #Need to check syntax of making a continuous line
  #filter(OOPs_peptide$crap_protein == 0) %>%
  #filter(OOPs_peptide$associated_crap_protein == 0) %>% 
  #filter(OOPs_peptide$unique == 1)

OOPs_peptide <- OOPs_peptide[, c("Sequence", "master_protein")]
length(OOPs_peptide$master_protein)
OOPs_peptide <- data.frame(unique(OOPs_peptide$master_protein))
```

```{r}
#Intersection between the datasets 
overlap <- intersect(peptide$unique.peptide.master_protein., OOPs_peptide$unique.OOPs_peptide.master_protein.)  
length(overlap)
venn <- draw.pairwise.venn(area1 =length(peptide$unique.peptide.master_protein.), area2 = length(OOPs_peptide$unique.OOPs_peptide.master_protein.), cross.area = length(overlap), category = c("total", "OOPS"), 	fill = c("blue", "red"))
grid.draw(venn)
grid.newpage()

```


2. RBP function analysis following BECKMANN et al 2015 methods: GO terms contain following information: ‘mRNA’ ‘splic’ ‘RNA binding’ ‘RNA’ ‘RNP’ ‘translation’ ‘ribosom’ ‘nuclease' ‘exosome


```{r}
availableUniprotSpecies(pattern = "sapiens") #Find taxon ID for human species 
up <- UniProt.ws(taxId = 9606) #Generating a UniProt.ws object for Homo Sapiens

peptider_OOPS <- select(up,
              keys = OOPs_peptide$unique.OOPs_peptide.master_protein., 
              columns = c("GO-ID", "ENSEMBL", "GO"), 
              keystyle = "UNIPROTKB") #Retrieve GO ID and GO terms 
#Caution: dataset has expaneded with the transfer between IDs. From 2199 to 2391
x <- peptider_OOPS #fix this database 

toMatch <- c("mRNA", "splic", "RNA binding", "RNA", "RNP", "translation", "ribosom", "nuclease", "exosome")
known_RBP <- grep(paste(toMatch,collapse="|"), peptider_OOPS$GO, value = TRUE)
print(length(known_RBP))
index <- grep(paste(toMatch,collapse="|"), peptider_OOPS$GO, value = FALSE)

known <- peptider_OOPS[index, ] 
unknown <- peptider_OOPS[-index, ]

c <- c(length(known$UNIPROTKB), length(unknown$UNIPROTKB))
barplot(c, width = 0.5, xlab = "known versus unknown")
print(unknown$GO)

Match <- c("membrane", "junction", "cortex")
unknown_sp <- grep(paste(Match,collapse = "|"), unknown$GO, value = FALSE)
all_gly <- grep(paste(Match,collapse = "|"), peptider_OOPS$GO, value = FALSE)

#Finding prosite binding domains 
#Trying to find the glycoprotein enrichment with features component of UniProt.ws
#Only 263/2199 have a "glyco" feature, need to proceed with API. 
#Considering location 621/2199 at junction and/or membrane 
test <- select(up,
              keys = OOPs_peptide$unique.OOPs_peptide.master_protein., 
              columns = c("FEATURES", "SUBCELLULAR-LOCATIONS"), 
              keystyle = "UNIPROTKB") 
test_gly <- grep(pattern = "Glyco", test$FEATURES, value = FALSE, ignore.case = TRUE)
print(length(test_gly))
test_gly_loc <- grep(pattern = "membrane | junction", test$`SUBCELLULAR-LOCATIONS`, value = FALSE, ignore.case = TRUE)
print(length(test_gly_loc))
#Attempt with PTM for glycoproteins



```

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Ctrl+Shift+K* to preview the HTML file).
